1. Data Loading and Wrangling
1.1 Loading all required libraries on RStudio
library(tidyverse)
library(here) # for the upkeep of the directory structure
library(janitor) # for data cleaning
library(lubridate)
library(gt) # for table building
library(sf) # for geospatial visualisation
library(ggplot2)
library(ggtext)
library(patchwork)
library(plotly)
library(cowplot)
library(stringr)
1.2 Loading all Health Board Data (January 2024:June 2025) at
once
Utilising janitor package by using the
clean_names() function to have uniform names throughout
datasets once the all prescription datasets are loaded.
urls_prescr <- list(
prescr_jan_june_2024 <- "https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/f0df380b-3f9b-4536-bb87-569e189b727a/download/hb_pitc2024_01_06-1.csv",
prescr_july_dec_2024 <- "https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/f3b9f2e2-66c0-4310-9b8e-734781d2ed0a/download/hb_pitc2024_07_12-1.csv",
prescr_jan_june_2025 <- "https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/9de908b3-9c28-4cc3-aa32-72350a0579d1/download/hb_pitc2025_01_06.csv")
# Reads all Health Board prescription data in a loop to avoid repetition
prescr_list <- map(urls_prescr,
~read_csv(.x) %>%
clean_names())
# Ainds together everything in a single tibble
prescr_raw <- bind_rows(prescr_list, .id = "source_file") %>%
mutate(paid_date_month = str_trim(as.character(paid_date_month))) %>%
select(-source_file)
glimpse(prescr_raw)
## Rows: 2,249,380
## Columns: 9
## $ hbt <chr> "S08000015", "S08000015", "S08000015", "S0800001…
## $ dmd_code <dbl> 1.001011e+15, 1.001411e+15, 1.001811e+15, 1.0018…
## $ bnf_item_code <chr> "0603020J0AAAEAE", "1001010P0AAAHAH", "1310012F0…
## $ bnf_item_description <chr> "HYDROCORTISONE 20MG TABLETS", "NAPROXEN 250MG G…
## $ prescribed_type <chr> "VMP", "VMP", "VMP", "VMPP", "VMP", "VMPP", "VMP…
## $ number_of_paid_items <dbl> 25, 53, 275, 1, 181, 2, 487, 1432, 66, 1, 1, 283…
## $ paid_quantity <dbl> 1244, 4046, 4695, 15, 25320, 240, 24924, 65820, …
## $ gross_ingredient_cost <dbl> 145.58, 187.17, 1111.15, 3.55, 4093.40, 38.80, 5…
## $ paid_date_month <chr> "202401", "202401", "202401", "202401", "202401"…
1.3 Selecting only antidepressant medication prescriptions
Filtering out by “bnf_item_code” to keep only those prescriptions
that are antidepressants (codes starting with ‘0403’) and that were
prescribed within investigation window.
# Only keeping antidepressant codes and aggregating prescriptions per HB per month
prescr_monthly <- prescr_raw %>%
filter(!is.na(bnf_item_code)) %>%
filter(str_detect(bnf_item_code, "^0403")) %>%
mutate(paid_date_month = as.integer(paid_date_month)) %>%
group_by(hbt, paid_date_month) %>%
summarise(number_of_items = sum(number_of_paid_items, na.rm = TRUE)) %>%
arrange(paid_date_month)
# Subsetting to fit our investigation window
prescr_monthly <- prescr_monthly %>%
filter(paid_date_month >= 202403, paid_date_month <= 202502)
prescr_monthly %>%
summarise(rows = n(), min_month = min(paid_date_month), max_month = max(paid_date_month))
## # A tibble: 15 × 4
## hbt rows min_month max_month
## <chr> <int> <int> <int>
## 1 S08000015 12 202403 202502
## 2 S08000016 12 202403 202502
## 3 S08000017 12 202403 202502
## 4 S08000019 12 202403 202502
## 5 S08000020 12 202403 202502
## 6 S08000022 12 202403 202502
## 7 S08000024 12 202403 202502
## 8 S08000025 12 202403 202502
## 9 S08000026 12 202403 202502
## 10 S08000028 12 202403 202502
## 11 S08000029 12 202403 202502
## 12 S08000030 12 202403 202502
## 13 S08000031 12 202403 202502
## 14 S08000032 12 202403 202502
## 15 SB0806 1 202412 202412
1.4 Creating seasonal categories
spr_months_202425 <- c(202403, 202404, 202405)
sum_months_202425 <- c(202406, 202407, 202408)
aut_months_202425 <- c(202409, 202410, 202411)
win_months_202425 <- c(202412, 202501, 202502)
seasons_202425 <- tibble(
paid_date_month = c(spr_months_202425, sum_months_202425, aut_months_202425, win_months_202425),
season = c(rep("Spring", length(spr_months_202425)),
rep("Summer", length(sum_months_202425)),
rep("Autumn", length(aut_months_202425)),
rep("Winter", length(win_months_202425))))
1.5 Allocating Met Office regional categorisations to all NHS
Scottish Health Boards
# Health Board official NHS names list
hb_names <- read_csv("https://www.opendata.nhs.scot/dataset/9f942fdb-e59e-44f5-b534-d6e17229cc7b/resource/652ff726-e676-4a20-abda-435b98dd7bdc/download/hb14_hb19.csv") %>%
clean_names()
# Met Office regional mapping
north_hb <- c("NHS Highland", "NHS Western Isles", "NHS Orkney", "NHS Shetland")
east_hb <- c("NHS Borders", "NHS Lothian", "NHS Fife", "NHS Tayside", "NHS Grampian", "NHS Forth Valley")
west_hb <- c("NHS Ayrshire and Arran", "NHS Dumfries and Galloway", "NHS Greater Glasgow and Clyde", "NHS Lanarkshire")
metoffice_hb_region <- tibble(
hb_name = c(north_hb, east_hb, west_hb),
region = c(rep("North", length(north_hb)),
rep("East", length(east_hb)),
rep("West", length(west_hb))))
# Health Boards per region according to Met Office scottish territorial classifications
hb_regional <- hb_names %>%
full_join(metoffice_hb_region, by = "hb_name") %>%
select(-c(hb_date_archived, hb_date_archived, hb_date_enacted, country))
1.6 Loading and organising NHS Health Board population data
From October 2024 data file. The data file chosen is deliberate as
October 2024 marks approximately half-way of the investigation window.
hb_pop <- read_csv("https://www.opendata.nhs.scot/dataset/e3300e98-cdd2-4f4e-a24e-06ee14fcc66c/resource/cec9341e-ccba-4c71-afe4-a614f5e97b9f/download/practice_listsizes_oct2024-open-data.csv") %>%
clean_names() %>%
select(hb, sex, all_ages) %>%
filter(!sex %in% c("Male", "Female")) %>%
group_by(hb) %>%
summarise(hb_population = sum(all_ages, na.rm = TRUE)) %>%
ungroup()
1.7 Joining prescriptions, Health Board, population, season and
regional categorisation to a single object.
prescr_seasonal <- prescr_monthly %>%
full_join(hb_regional %>%
select(hb, hb_name, region), by = join_by(hbt == hb)) %>%
full_join(hb_pop, by = join_by(hbt == hb)) %>%
full_join(seasons_202425, by = join_by(paid_date_month))
1.8 Notes at this stage:
- After the
full_join() there are 4 rows with NA in
prescription data and population (paid_date_month, number_or_items, and
hb_population). Upon further investigation and a look at the object
hb_names, these were shown to have had their hbt numbers archived in
2018 and 2019, making them easily removable from our data.
- There is a row with hbt ‘SB0806’ which shows NA for hb_name though
has antidepressant prescription data for December 2024 only. Given the
value (‘2’) is extremely low and this hbt is not on record or appears
any other time, it has been decided to be excluded from the
analysis.
prescr_seasonal <- prescr_seasonal %>%
filter(!is.na(hb_name)) %>%
filter(!is.na(paid_date_month))
# checking if there is a missing region or population
prescr_seasonal %>%
filter(is.na(region) | is.na(hb_population)) # tibble of 0 x 7 shows we have eliminated all NAs
## # A tibble: 0 × 7
## # Groups: hbt [0]
## # ℹ 7 variables: hbt <chr>, paid_date_month <dbl>, number_of_items <dbl>,
## # hb_name <chr>, region <chr>, hb_population <dbl>, season <chr>
1.9 Population Weighting for NHS Health Board data
A calculation of items_per_1000_people was be made to allow for
population weighting of prescription items. This is because all NHS
Health Boards have different populations, thus, comparing their
“number_of_items” solely would be affected by population sizes.
prescr_seasonal_standard <- prescr_seasonal %>%
mutate(items_per_1000 = (number_of_items/hb_population)*1000)
1.10 Introducing Met Office UK daylight hours data and creating a
new function
This is somewhat challenging. Given the nature of the files the Met
Office has available (.txt), I converted them into .csv files using
Excel. These files can be found in the docs\data folder
attached.. Upon inspection of the data, one can notice that there are
specific columns for each season apart from one for each month. This
report uses seasonal data to make the merging processes easier. Building
a function had to be done to avoid repeting the same wrangling for each
region.
Note: these .csv files contain columns named spr,
sum, aut, win and a year column
for each year and season
# Reading all CSV files first
daylight_north <- read_table(here("docs", "data", "R_north_scotland_sunshine.csv")) %>%
clean_names()
daylight_east <- read_table(here("docs","data", "R_east_scotland_sunshine.csv")) %>%
clean_names()
daylight_west <- read_table(here("docs","data","R_west_scotland_sunshine.csv")) %>%
clean_names()
# Making a function to avoid code repetition for each .csv file
daylight_season_function <- function(data, region_name, year_filter, season_cols = c("win", "spr", "sum", "aut"), year_cols = c("year_12", "year_13", "year_14", "year_15"), full_season_names = c("Winter", "Spring", "Summer", "Autumn")) {
# built-in checker
if(length(season_cols)!= length(year_cols)) stop("season_cols and year_cols need to have the same length")
if(length(season_cols)!= length(full_season_names)) stop("full_season_names and season_cols need to have the same length")
# processing each individual season
season_list <- map2(season_cols, year_cols, ~ {
data %>%
select(all_of(.x), all_of(.y)) %>%
filter(.data[[.y]] == year_filter) %>%
rename(year = all_of(.y))
})
# joining all four seasons together
season_complete <- reduce(season_list, full_join, by = "year") %>%
relocate(all_of(season_cols), .after = last_col()) %>%
mutate(across(all_of(season_cols), as.numeric)) %>%
pivot_longer(cols = all_of(season_cols), names_to = "season", values_to = "daylight_hrs") %>%
mutate(season = recode(season, !!!set_names(full_season_names, season_cols)),
region = region_name) %>%
arrange(year, factor(season, levels = full_season_names)) %>%
filter(!is.na(daylight_hrs))
return(season_complete)
}
1.11 Using the daylight_season_function
daylight_season_function() was used to wrangle and
select specific data from each Met Office UK regional daylight dataset.
all_seasons_daylight contains all daylight hours per season
per region for the duration of the investigation window.
# Using the function for each region and making sure the "year" category is a character
season_daylight_north <- daylight_season_function(daylight_north, "North", "2024")
season_daylight_north <- season_daylight_north %>%
mutate(year = as.character(year))
season_daylight_east <- daylight_season_function(daylight_east, "East", "2024")
season_daylight_east <- season_daylight_east %>%
mutate(year = as.character(year))
season_daylight_west <- daylight_season_function(daylight_west, "West", "2024")
season_daylight_west <- season_daylight_west %>%
mutate(year = as.character(year))
# Merging all regional daylight data for each season
all_seasons_daylight <- bind_rows(season_daylight_north, season_daylight_east, season_daylight_west)
all_seasons_daylight
## # A tibble: 12 × 4
## year season daylight_hrs region
## <chr> <chr> <dbl> <chr>
## 1 2024 Winter 122. North
## 2 2024 Spring 375. North
## 3 2024 Summer 330. North
## 4 2024 Autumn 230. North
## 5 2024 Winter 164. East
## 6 2024 Spring 362. East
## 7 2024 Summer 453. East
## 8 2024 Autumn 269. East
## 9 2024 Winter 137. West
## 10 2024 Spring 367. West
## 11 2024 Summer 403. West
## 12 2024 Autumn 230. West
1.12. Loading and merging SIMD ranking data
The Scottish Index of Multiple Deprivation (SIMD) data was merged,
wrangled and saved for later analysis.
# Median rank per Health Board was used to summarise the distribution of deprivation
simd_raw <- read_csv("https://www.opendata.nhs.scot/dataset/78d41fa9-1a62-4f7b-9edb-3e8522a93378/resource/acade396-8430-4b34-895a-b3e757fa346e/download/simd2020v2_22062020.csv") %>%
clean_names()
simd_hb <- simd_raw %>%
select(hb, simd2020v2rank) %>%
group_by(hb) %>%
summarise(SIMD_median_rank = median(simd2020v2rank, na.rm = TRUE)) %>%
ungroup()
# Joining daylight, SIMD and seasonal prescription data
analysis_prescr_simd <- prescr_seasonal_standard %>%
full_join(all_seasons_daylight, by = join_by(season, region)) %>%
full_join(simd_hb, by = join_by(hbt == hb))
# Checking if any NAs are present
analysis_prescr_simd %>%
summarise(missing_daylight = sum(is.na(daylight_hrs)), missing_simd = sum(is.na(SIMD_median_rank))) # result should show a 14 x 3 tibble with value zero (0) for each
## # A tibble: 14 × 3
## hbt missing_daylight missing_simd
## <chr> <int> <int>
## 1 S08000015 0 0
## 2 S08000016 0 0
## 3 S08000017 0 0
## 4 S08000019 0 0
## 5 S08000020 0 0
## 6 S08000022 0 0
## 7 S08000024 0 0
## 8 S08000025 0 0
## 9 S08000026 0 0
## 10 S08000028 0 0
## 11 S08000029 0 0
## 12 S08000030 0 0
## 13 S08000031 0 0
## 14 S08000032 0 0
1.13. Loading and merging Healthboard geospatial data
The NHS Health Boards shapefile was merged, wrangled and saved for
geospatial analysis.
hb_shp_geo <- st_read(here("docs","data", "Week6_NHS_healthboards_2019.shp")) %>%
clean_names()
## Reading layer `Week6_NHS_healthboards_2019' from data source
## `/Users/florenciasolorzano/Documents/data_science/B218332/docs/data/Week6_NHS_healthboards_2019.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 14 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 7564.996 ymin: 530635.8 xmax: 468754.8 ymax: 1218625
## Projected CRS: OSGB36 / British National Grid
analysis_geo_prescr <- analysis_prescr_simd %>%
group_by(hbt, season, SIMD_median_rank) %>%
summarise(av_items_per_1000 = mean(items_per_1000, na.rm = TRUE), av_daylight = mean(daylight_hrs)) %>%
ungroup() %>%
full_join(hb_shp_geo, by = join_by(hbt == hb_code)) %>%
st_as_sf()
2. Data Analysis - Results
A seasonal summary table by region using gt():
full_seasonal_table <- analysis_prescr_simd %>%
group_by(region, season) %>%
summarise(av_daylight = mean(daylight_hrs, na.rm = TRUE), av_items_per_1000 = mean(items_per_1000, na.rm = TRUE)) %>%
ungroup() %>%
arrange(region, factor(season, levels = c("Spring", "Summer", "Autumn", "Winter")))
full_seasonal_table %>%
mutate(av_items_per_1000 = round(av_items_per_1000, 2),
av_daylight = round(av_daylight, 2)) %>%
gt(groupname_col = "region") %>%
cols_label(
season = md("Season"),
av_daylight = md("Mean Total Daylight (hrs)"),
av_items_per_1000 = md("Mean Prescriptions (units/1000 people)")) %>%
tab_header(
title = md("Antidepressant Prescriptions per 1000 and Total Daylight hours by region"),
subtitle = "March 1st, 2024 - February 28th, 2025") %>%
fmt_number(columns = c(av_items_per_1000, av_daylight), decimals = 2)
| Antidepressant Prescriptions per 1000 and Total Daylight hours by region |
| March 1st, 2024 - February 28th, 2025 |
| Season |
Mean Total Daylight (hrs) |
Mean Prescriptions (units/1000 people) |
| East |
| Spring |
361.80 |
113.06 |
| Summer |
452.60 |
113.84 |
| Autumn |
268.60 |
113.79 |
| Winter |
163.60 |
114.51 |
| North |
| Spring |
374.60 |
121.67 |
| Summer |
330.40 |
120.00 |
| Autumn |
229.70 |
121.45 |
| Winter |
121.90 |
121.96 |
| West |
| Spring |
367.40 |
137.25 |
| Summer |
403.10 |
138.44 |
| Autumn |
230.30 |
138.47 |
| Winter |
137.20 |
139.04 |
This table shows the mean seasonal total daylight hours and the mean
antidepressant prescriptions per 1000 population for each region. This
is the numeric anchor for the following data visualisations: a) regional
dual bar seasonal plot b) seasonal heat map c) deprivation vs
prescribing scatter plot
3. Data Visualisation
3.1. Figure 1: Regional Dual Bar Seasonal Plot
Figure 1 shows a dual bar chart graph displaying antidepressant
prescriptions per 1000 people proportional to each NHS Health Board
population and average daylight hours per month from March 2024 to March
2025, faceted by Scottish Geographical Region (North, East, and
West).
dual_bar_analysis <- analysis_prescr_simd %>%
group_by(region, season) %>%
summarise(av_items_per_1000 = mean(items_per_1000, na.rm = TRUE), av_daylight = mean(daylight_hrs, na.rm = TRUE)) %>%
ungroup() %>%
mutate(season = factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))) %>%
pivot_longer(cols = c(av_daylight, av_items_per_1000),
names_to = "variable",
values_to = "value")
dual_bar_plot <- dual_bar_analysis %>%
ggplot(aes(x = season, y = value, fill = variable, text = ifelse(
variable == "av_daylight",
paste0("Daylight (hrs): ", round(value, 2)),
paste0("Prescriptions: ", round(value, 2))))) +
geom_col(position = position_dodge(width = 0.9), alpha = 0.8) +
facet_wrap(~region, nrow = 1, scales = "free_x") +
scale_y_continuous(
name = str_wrap("Average Total Daylight (hrs)", width = 30),
breaks = seq(0, max(dual_bar_analysis$value), 50),
sec.axis = sec_axis(~ ., name = str_wrap("Average Antidepressant Prescriptions (units/1000 people)", width = 35), breaks = seq(0, max(dual_bar_analysis$value), 50))) +
scale_fill_manual(values = c("orange", "skyblue"),
labels = c("Daylight (hrs)", "Prescriptions (units/1000)")) +
labs(
title = str_wrap("Average seasonal total daylight hours and average antidepressant prescription items by region", width = 63),
subtitle = "Prescriptions per 1000 people across Scottish Health Boards",
x = "Season",
fill = "") +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 10),
strip.background = element_rect(fill = "gray90", color = NA),
strip.text = element_text(face = "bold", size = 12),
plot.title = element_text(face = "bold", size = 16, margin = margin(b = 5), hjust = 0.3),
plot.title.position = "panel",
plot.subtitle = element_text(face = "bold", size = 12, margin = margin(b = 10), hjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(t = 20, r = 10, b = 40, l = 10, unit = "pt"))
dual_bar_plot

Prescriptions seem to marginally increase from Spring to Winter.
However, summer seasons across regions don’t show the lowest average
antidepressant prescriptions. It seems that antidepressant prescription
trends increase as the seasonal year progresses.
3.3. Figure 3: Bivariate Map of the relatedness between SIMD
rankings and antidepressant prescriptions per season
Figure 3 shows geospatial visualisation of whether SIMD rankings and
antidepressant prescriptions are correlated.
# Bivariate map bins
biv_bins <- analysis_geo_prescr %>%
mutate(prescr_bin = ntile(av_items_per_1000,3),
simd_bin = ntile(SIMD_median_rank, 3),
biv_class = paste0(prescr_bin, "-", simd_bin))
# bivariate map bin colours
biv_palette <- c(
"1-1" = "#e8e8e8",
"2-1" = "#b8d6be",
"3-1" = "#64acbe",
"1-2" = "#d4b9da",
"2-2" = "#a5add3",
"3-2" = "#4a7bb7",
"1-3" = "#c994c7",
"2-3" = "#df65b0",
"3-3" = "#dd1c77")
# bivariate map bin matrix
biv_matrix <- expand.grid(prescr_bin = 1:3, simd_bin = 1:3) %>%
mutate(simd_bin = simd_bin, biv_class = paste0(prescr_bin, "-", simd_bin))
# bivariate map legend
biv_legend <- biv_matrix %>%
ggplot(aes(x = prescr_bin, y = simd_bin, fill = biv_class)) +
geom_tile(color = "white") +
scale_fill_manual(values = biv_palette, guide = "none") +
scale_y_continuous(breaks = 1:3, labels = c("Low", "", "High")) +
scale_x_continuous(breaks = 1:3, labels = c("Low", "", "High")) +
labs(
x = "Prescriptions",
y = "SIMD Rank") +
coord_fixed(ratio = 1)+
theme_minimal(base_size = 9) +
theme_void()+
theme(
axis.title = element_text(size = 8, face = "bold"),
axis.text = element_text(size = 6),
panel.grid = element_blank(),
plot.margin = margin(t = 0, r = 5, b = 0, l = 0))
# bivariate map
map_biv_solo <- biv_bins %>%
ggplot() +
geom_sf(aes(fill = biv_class), size = 0.1, colour = "white") +
scale_fill_manual(values = biv_palette, guide = "none") +
facet_wrap(~season, nrow = 1) +
labs(
title = str_wrap("Seasonal Antidepressant prescriptions comparison with Median Deprivation Rankings"),
subtitle = str_wrap("Prescriptions per 1000 people across Scottish NHS Health Boards | Scottish Index of Multiple Deprivation (SIMD) Rank"),
fill = "Bivariate classification") +
coord_sf(expand = FALSE) +
theme_void()+
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(r = 0.5, b = 0.5), hjust = 0.5),
plot.title.position = "panel",
plot.subtitle = element_text(size = 10, margin = margin(t = 10, b = 20), hjust = 1),
panel.spacing = unit(0.1, "lines"),
strip.text = element_text(face = "bold", size = 8),
plot.margin = margin(t = 0, r = 10, b = 0, l = 10, unit = "pt"))
full_bivariate_map <- plot_grid(map_biv_solo, biv_legend,
ncol = 2,
rel_widths = c(4, 1),
align = "t")
full_bivariate_map

Lower SIMD ranks indicate more deprived Health Board populations. The
general trend seems to be that the higher the SIMD rank, the lesser the
antidepressant prescriptions per Health Board. Only significant
differences have been shown when comparing summer and winter, which do
indicate that higher baseline prescribing is delivered more in deprived
areas during the season with least sunshine throughout the year. This is
also corroborated in the Figure 5 in Appendix
1.4 which has been included for interest.